# General Imports
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import glob
import folium
import json
import branca.colormap as cmp
import plotly.graph_objects as go
from datetime import datetime
from copy import deepcopy
import plotly.offline as pyo
pyo.init_notebook_mode()  # enable inline (offline) plotly rendering in the notebook
# County-level EV population history: one row per county per reporting date.
history_data = pd.read_csv('data/EV_Data/Electric_Vehicle_Population_Size_History_By_County.csv')
history_data.head()
| Date | County | State | Vehicle Primary Use | Battery Electric Vehicles (BEVs) | Plug-In Hybrid Electric Vehicles (PHEVs) | Electric Vehicle (EV) Total | Non-Electric Vehicle Total | Total Vehicles | Percent Electric Vehicles | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | October 31 2019 | Flathead | MT | Passenger | 1 | 0 | 1 | 70 | 71 | 1.41 |
| 1 | October 31 2019 | New London | CT | Passenger | 0 | 2 | 2 | 244 | 246 | 0.81 |
| 2 | June 30 2019 | Pend Oreille | WA | Truck | 0 | 0 | 0 | 5730 | 5730 | 0.00 |
| 3 | November 30 2019 | Virginia Beach | VA | Passenger | 1 | 0 | 1 | 706 | 707 | 0.14 |
| 4 | February 28 2018 | Sacramento | CA | Passenger | 1 | 0 | 1 | 307 | 308 | 0.32 |
# Per-vehicle registration records: one row per registered EV.
vehicle_data = pd.read_csv('data/EV_Data/Electric_Vehicle_Population_Data.csv')
vehicle_data.head()
| VIN (1-10) | County | City | State | Postal Code | Model Year | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | Vehicle Location | Electric Utility | 2020 Census Tract | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5YJYGDEF5L | Thurston | Lacey | WA | 98516.0 | 2020 | TESLA | MODEL Y | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 291 | 0 | 22.0 | 124535071 | POINT (-122.7474291 47.0821119) | PUGET SOUND ENERGY INC | 5.306701e+10 |
| 1 | 1N4BZ1CP1K | King | Sammamish | WA | 98074.0 | 2019 | NISSAN | LEAF | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 150 | 0 | 45.0 | 102359449 | POINT (-122.0313266 47.6285782) | PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) | 5.303303e+10 |
| 2 | 5YJXCDE28G | King | Kent | WA | 98031.0 | 2016 | TESLA | MODEL X | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 200 | 0 | 33.0 | 228682037 | POINT (-122.2012521 47.3931814) | PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) | 5.303303e+10 |
| 3 | JHMZC5F37M | Kitsap | Poulsbo | WA | 98370.0 | 2021 | HONDA | CLARITY | Plug-in Hybrid Electric Vehicle (PHEV) | Clean Alternative Fuel Vehicle Eligible | 47 | 0 | 23.0 | 171566447 | POINT (-122.64177 47.737525) | PUGET SOUND ENERGY INC | 5.303509e+10 |
| 4 | WA1F2AFY4P | Thurston | Olympia | WA | 98501.0 | 2023 | AUDI | Q5 E | Plug-in Hybrid Electric Vehicle (PHEV) | Not eligible due to low battery range | 23 | 0 | 22.0 | 234923230 | POINT (-122.89692 47.043535) | PUGET SOUND ENERGY INC | 5.306701e+10 |
us_outlines = {}
# Load every GeoJSON outline file, keyed by its filename stem (no extension).
for path in glob.glob('data/Geo_Data/*.json'):
    stem = os.path.splitext(os.path.basename(path))[0]
    with open(path, 'r', encoding='utf-8', errors='ignore') as fh:
        us_outlines[stem] = json.load(fh)
To start, we must drop any rows with missing (NaN) data.
# Drop rows containing any missing values, then verify that none remain.
# (Original mixed an f-string prefix with %-formatting and summed NaNs
# column-by-column; isna().sum().sum() does the same check vectorized.)
history_data.dropna(inplace=True)
print(f'Num of history_data NaNs: {history_data.isna().sum().sum()}')
vehicle_data.dropna(inplace=True)
print(f'Num of vehicle_data NaNs: {vehicle_data.isna().sum().sum()}')
Num of history_data NaNs: 0 Num of vehicle_data NaNs: 0
The GeoJson datasets contain geo data on all relevant locations in the U.S. We must extract the counties specific to Washington for our analysis. The GeoJson for the counties uses the state number to represent the states, but our main datasets don't use this. We will need to extract this value from the states GeoJson, then use this id to extract only the counties in Washington.
# Pull Washington's numeric state id out of the states GeoJSON.
wa_id = next(feature['properties']['STATE']
             for feature in us_outlines['states']['features']
             if feature['properties']['NAME'] == 'Washington')
print(f'Washington State ID: {wa_id}')
Washington State ID: 53
# Get geo data of all counties in Washington.
# Use the wa_id extracted above instead of re-hardcoding '53', so the two
# lookups cannot drift apart.
wa_counties = [county for county in us_outlines['counties']['features']
               if county['properties']['STATE'] == wa_id]
wa_county_outlines = deepcopy(us_outlines['counties'])
wa_county_outlines['features'] = wa_counties
print(f'Number of counties: {len(wa_counties)}')
Number of counties: 39
Each row of vehicle_data is an individual vehicle with information on where it resides. A new dataframe must be made to hold the number of EVs per county, for use in our choropleths.
# Count EVs per county (index = county name, single column 'Count').
# Rename the Series itself before framing: in pandas >= 2.0 value_counts()
# names its output 'count', so rename(columns={'County': 'Count'}) would
# silently do nothing; this form works on old and new pandas alike.
total_ev_per_county = vehicle_data['County'].value_counts().rename('Count').to_frame()
total_ev_per_county.head()
| Count | |
|---|---|
| King | 84940 |
| Snohomish | 19012 |
| Pierce | 12575 |
| Clark | 9595 |
| Thurston | 5880 |
We will transform the date in history_data into DateTime to extract date/time parts more easily.
import calendar
# Month-name -> month-number lookup; calendar.month_name[0] is the empty
# string placeholder, so start from index 1.
month_mapper = dict(zip(calendar.month_name[1:], range(1, 13)))
month_mapper
# Parse 'Month DD YYYY' strings into datetime objects, skipping the work
# if the column has already been converted on a previous run.
if pd.api.types.is_string_dtype(history_data['Date'].dtype):
    def _parse_history_date(raw):
        # e.g. 'October 31 2019' -> datetime(2019, 10, 31)
        month_name, day, year = raw.split()
        return datetime.strptime(f'{month_mapper[month_name]}/{day}/{year}', '%m/%d/%Y')
    history_data['Date'] = history_data['Date'].apply(_parse_history_date)
1-10 VIN is not unique for each car, making it a poor option to differentiate the cars, so we'll need a column to store the absolute index of each row in the original dataset to avoid index shifts when extracting columns, etc.
The coordinates of each vehicle in vehicle_data are Point objects. Without converting to shapely object, we will instead convert them into 2 element lists.
# Store the original row index so later subsets can refer back to it
# (the 1-10 VIN is not unique, so it cannot serve as an identifier).
vehicle_data['Index'] = vehicle_data.index
# Convert 'POINT (x y)' strings to [x, y] float lists.
# Raw string for the regex: the original 'POINT\s\((.*)\)' relied on
# invalid escape sequences, which raise warnings on modern Python.
import re
vehicle_data['Vehicle Location'] = vehicle_data['Vehicle Location'].apply(
    lambda loc: [float(coord) for coord in re.findall(r'POINT\s\((.*)\)', loc)[0].split()])
vehicle_data.head(2)
| VIN (1-10) | County | City | State | Postal Code | Model Year | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | Vehicle Location | Electric Utility | 2020 Census Tract | Index | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5YJYGDEF5L | Thurston | Lacey | WA | 98516.0 | 2020 | TESLA | MODEL Y | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 291 | 0 | 22.0 | 124535071 | [-122.7474291, 47.0821119] | PUGET SOUND ENERGY INC | 5.306701e+10 | 0 |
| 1 | 1N4BZ1CP1K | King | Sammamish | WA | 98074.0 | 2019 | NISSAN | LEAF | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 150 | 0 | 45.0 | 102359449 | [-122.0313266, 47.6285782] | PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) | 5.303303e+10 | 1 |
Convert SVG images to png
The data for each county can be assumed to be approximately the population data as the dataset source is described to show all the EVs "that are currently registered through Washington State Department of Licensing (DOL)".
# Gather the years present, then tally EV totals per year and per month
# (both per-year/per-month and one continuous month sequence, separately).
years = list(history_data['Date'].dt.year.value_counts().keys())
register_per_year = {}
year_month_count = {}
# Continuous month counters keyed 'YYYY/M' across the whole span.
year_month_cont = {f'{yr}/{m}': 0 for yr in sorted(years) for m in range(1, 13)}
for yr in sorted(years):
    yr_rows = history_data[history_data['Date'].dt.year == yr]
    # Yearly total = the last reported month's total (Nov for 2023, Dec otherwise).
    last_month = 11 if yr == 2023 else 12
    register_per_year[yr] = yr_rows[yr_rows['Date'].dt.month == last_month]['Electric Vehicle (EV) Total'].sum()
    # Monthly counts, separated by year.
    year_month_count.setdefault(yr, {m: 0 for m in range(1, 13)})
    for _, row in yr_rows.iterrows():
        m = row['Date'].month
        ev_total = int(row['Electric Vehicle (EV) Total'])
        year_month_count[yr][m] += ev_total
        year_month_cont[f'{yr}/{m}'] += ev_total
# Running (cumulative) total of yearly registration counts.
cum_year_count = {}
running_total = 0
for yr, count in register_per_year.items():
    running_total += count
    cum_year_count[yr] = running_total
# Plot the cumulative yearly registration counts.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(cum_year_count.keys()),
                         y=list(cum_year_count.values())))
fig.update_layout(title_text='Cum. # of EVs Registered with Washington Up to 2023')
fig.show()
# Year-over-year change in registration counts.
# The '0000' sentinel seeds the first entry ('0000 to <first year>').
annual_registration_change = {}
last_seen = ('0000', 0)
for yr, count in register_per_year.items():
    annual_registration_change[f'{last_seen[0]} to {yr}'] = count - last_seen[1]
    last_seen = (yr, count)
# Plot monthly counts: first as one continuous series across all years,
# then as one trace per year on shared month axes.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(year_month_cont.keys()),
                         y=list(year_month_cont.values())))
fig.update_layout(title_text='# of EVs Registered Each Month In 2017 to 2023')
fig.show()
fig = go.Figure()
for year in year_month_count:
    fig.add_trace(go.Scatter(x=list(year_month_count[year].keys()),
                             y=list(year_month_count[year].values()),
                             name=year))
# Title typo fixed: 'Seperated' -> 'Separated'.
fig.update_layout(title_text='# of EVs Registered Each Month (Separated) In 2017 to 2023')
fig.show()
Let's also look at the overall statistics and distribution of the means of each month. Recall this dataset is representative of the population of Washington, so we can consider them to be the population mean, variance, and standard deviation ($\mu$, $\sigma^2$, $\sigma$), etc.
def compute_vis_month_stat(year_month_count):
    '''
    Compute the population mean, variance, and standard deviation of the EV
    counts for each month (pooled across years) and visualize each month's
    distribution as a violin plot with an inner box plot.

    Parameters
    ----------
    year_month_count : dict
        Mapping of year -> {month (1-12) -> EV count}.
    '''
    # Collect each month's counts across all years.
    # (Was a list comprehension used purely for its side effects.)
    month_count_list = {month: [] for month in range(1, 13)}
    for year in year_month_count:
        for month in month_count_list:
            month_count_list[month].append(year_month_count[year][month])
    # Compute mean, variance, and standard deviation per month.
    months_mv = {month: (np.mean(month_count_list[month]),
                         np.var(month_count_list[month]),
                         np.std(month_count_list[month])) for month in month_count_list}
    # Violin (+ box) plot of the count distribution for each month.
    fig = go.Figure()
    for month in month_count_list:
        fig.add_trace(go.Violin(y=month_count_list[month],
                                name=month,
                                fillcolor='red',
                                line_color='black',
                                opacity=0.5,
                                box_visible=True))
    fig.update_layout(showlegend=False,
                      title_text='Distribution of the Mean Count of Each Month')
    fig.show()
    print("Month Stats\n", pd.DataFrame(months_mv).rename(index={0: 'Mean', 1: 'Var', 2: 'Std'}))
# Compute and visualize the month count stats on the raw (unfixed) counts.
compute_vis_month_stat(year_month_count)
Month Stats
1 2 3 4 5 \
Mean 6.083129e+04 6.204314e+04 6.332943e+04 6.509371e+04 6.658186e+04
Var 1.014911e+09 1.065476e+09 1.122115e+09 1.203650e+09 1.279125e+09
Std 3.185766e+04 3.264163e+04 3.349799e+04 3.469366e+04 3.576485e+04
6 7 8 9 10 \
Mean 6.820886e+04 7.012743e+04 7.210943e+04 7.359943e+04 7.583986e+04
Var 1.361241e+09 1.449161e+09 1.586474e+09 1.666882e+09 1.763561e+09
Std 3.689500e+04 3.806785e+04 3.983056e+04 4.082747e+04 4.199478e+04
11 12
Mean 7.758057e+04 5.591071e+04
Var 1.821080e+09 1.247413e+09
Std 4.267411e+04 3.531873e+04
OBSOLETE There is a clear indication that 2023 is an abnormality in the general trend from the plot; however, December is completely missing data, which is also drastically affecting our $\mu$ and $\sigma^2$ and by extension $\sigma$. We will need to fill in this value. To do this, the best we can do is analyze the trend and obtain the average change per month throughout the years between November and December, excluding 2023. Using the mean change, we can approximate the most likely scenario for December given the registration count we had for November.
# Approximate the missing December 2023 value from the average
# November -> December change observed in all earlier years.
from math import ceil, floor
nov_dec_deltas = [year_month_count[yr][12] - year_month_count[yr][11]
                  for yr in year_month_count if yr != 2023]
mean_n_d_change = np.mean(nov_dec_deltas)
# Round away from zero (ceil for positive means, floor for negative).
mean_n_d_change = ceil(mean_n_d_change) if mean_n_d_change >= 0 else floor(mean_n_d_change)
# Fill December 2023 in a copy so the original counts stay untouched.
year_month_count_fix = deepcopy(year_month_count)
year_month_count_fix[2023][12] = year_month_count_fix[2023][11] + mean_n_d_change
dec_2023 = year_month_count_fix[2023][12]
print(f'Mean Change From Nov to Dec Each Year (excluding 2023): {mean_n_d_change}')
print(f'Filled Approximate of Dec Registrations: {dec_2023}')
Mean Change From Nov to Dec Each Year (excluding 2023): 1359 Filled Approximate of Dec Registrations: 161198
# Re-compute the month stats now that December 2023 has been filled in.
compute_vis_month_stat(year_month_count_fix)
Month Stats
1 2 3 4 5 \
Mean 6.083129e+04 6.204314e+04 6.332943e+04 6.509371e+04 6.658186e+04
Var 1.014911e+09 1.065476e+09 1.122115e+09 1.203650e+09 1.279125e+09
Std 3.185766e+04 3.264163e+04 3.349799e+04 3.469366e+04 3.576485e+04
6 7 8 9 10 \
Mean 6.820886e+04 7.012743e+04 7.210943e+04 7.359943e+04 7.583986e+04
Var 1.361241e+09 1.449161e+09 1.586474e+09 1.666882e+09 1.763561e+09
Std 3.689500e+04 3.806785e+04 3.983056e+04 4.082747e+04 4.199478e+04
11 12
Mean 7.758057e+04 7.893900e+04
Var 1.821080e+09 1.854169e+09
Std 4.267411e+04 4.306006e+04
Now that we have the statistics without missing data, where December is approximated using previous years' trends, let's go ahead and re-plot the change bar plot and count scatter plots.
# Copy each count dict, then fold the approximated December 2023 value in.
cum_year_count_fix = deepcopy(cum_year_count)
annual_registration_change_fix = deepcopy(annual_registration_change)
year_month_cont_fix = deepcopy(year_month_cont)
cum_year_count_fix[2023] += dec_2023
annual_registration_change_fix['2022 to 2023'] += dec_2023
year_month_cont_fix['2023/12'] += dec_2023
# Re-plot the cumulative yearly counts with the December fill applied.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(cum_year_count_fix.keys()),
                         y=list(cum_year_count_fix.values())))
fig.update_layout(title_text='Cum. # of EVs Registered with Washington Up to 2023')
fig.show()
# Re-plot the continuous monthly series with the December fill applied.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(year_month_cont_fix.keys()),
                         y=list(year_month_cont_fix.values())))
fig.update_layout(title_text='# of EVs Registered Each Month In 2017 to 2023')
fig.show()
# NOTE(review): this Figure is never populated or shown in the visible code —
# looks like a leftover from the per-year re-plot; confirm before removing.
fig = go.Figure()
OBSOLETE Now that December 2023 is fixed, we can see that there is still a drastic difference in the change of EV registration counts between 2022 and 2023. With all the information now correct, we can make a decision on the predictor we'd like to use. In this case, since the rate at which new EVs are being registered is slowing down, it is safe to assume we can use parabolic regression to predict the number of new EVs we'll get per year. We can also see this if we plot it out. For our regression, we will now assume "years since 2017", i.e. 2018 = 1 for 1 year since 2017.
Given that the rate of change is parabolic, we can expect the cumulative sum to be logarithmic.
from scipy.optimize import curve_fit
def exp_func(X, a, b):
    '''Exponential model y = a * e^(b*x), used as the curve_fit target.'''
    return a * np.exp(X * b)
# Fit the exponential function to years-since-2017 vs cumulative counts.
X = np.array([yr - 2017 for yr in cum_year_count_fix.keys()])
Y = np.array(list(cum_year_count_fix.values()))
popt, cov = curve_fit(exp_func, X, Y)
a, b = popt  # fitted coefficients for exp_func
# Evaluate the fit quality on the observed years via MAPE.
from sklearn.metrics import mean_absolute_percentage_error
Y_test = exp_func(X, a, b)
mape = mean_absolute_percentage_error(Y, Y_test)
print(f'MAPE: {mape} ({mape * 100}%)')
MAPE: 0.12740173731921342 (12.740173731921342%)
# Predict cumulative registered EVs for 2024-2026 using the fitted curve.
# (Comment previously said "up to 2028", but range(2024, 2027) stops at 2026;
# dead commented-out polynomial-regression code removed.)
X_pred = np.array([[x - 2017] for x in range(2024, 2027)])
Y_pred = exp_func(X_pred, a, b)
# Append the predicted years to the fixed cumulative counts.
cum_year_count_pred = deepcopy(cum_year_count_fix)
cum_year_count_pred.update({year: round(y_p[0]) for year, y_p in zip(range(2024, 2027), Y_pred)})
# Plot observed + predicted cumulative counts together.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(cum_year_count_pred.keys()),
                         y=list(cum_year_count_pred.values())))
fig.update_layout(title_text='Cum. # of EVs Registered with Washington Up to 2023, With Predictions to 2026')
fig.show()
def compute_county_center(coords):
    '''
    Approximate a county's center as the midpoint of its bounding box.

    Parameters
    ----------
    coords : list
        GeoJSON-style coordinates: [x, y] pairs, possibly nested inside
        further lists (nesting occurs for counties with islands).

    Returns
    -------
    list
        [center_x, center_y] — midpoints of the x and y extents.
    '''
    # (Original inner function took an unused `test` parameter — removed.)
    def _bounding_box(coord_list):
        min_x, max_x = np.inf, -np.inf
        min_y, max_y = np.inf, -np.inf
        for coord in coord_list:
            if isinstance(coord[0], list):
                # Nested coordinate list: recurse and merge bounding boxes.
                (nx_min, nx_max), (ny_min, ny_max) = _bounding_box(coord)
                min_x, max_x = min(min_x, nx_min), max(max_x, nx_max)
                min_y, max_y = min(min_y, ny_min), max(max_y, ny_max)
            else:
                min_x, max_x = min(min_x, coord[0]), max(max_x, coord[0])
                min_y, max_y = min(min_y, coord[1]), max(max_y, coord[1])
        return [[min_x, max_x], [min_y, max_y]]
    assert isinstance(coords, list)
    (min_x, max_x), (min_y, max_y) = _bounding_box(coords)
    return [(min_x + max_x) / 2, (min_y + max_y) / 2]
def init_map(wa_county_outlines, **kwargs):
    '''
    Build a folium map of Washington with county outlines, a hover highlight,
    and a county-name tooltip. Extra GeoJson options can be passed via kwargs
    and override the defaults.
    '''
    # Only install defaults the caller did not provide.
    if 'highlight_function' not in kwargs:
        kwargs['highlight_function'] = lambda county: {'fillColor': '#000000', 'fillOpacity': 0.20}
    if 'tooltip' not in kwargs:
        kwargs['tooltip'] = folium.features.GeoJsonTooltip(fields=['NAME'], aliases=['County'])
    # Center the map roughly on Washington state.
    new_map = folium.Map([47.751076, -120.740135], zoom_start=6.5)
    outline_layer = folium.features.GeoJson(data=wa_county_outlines, **kwargs)
    new_map.add_child(outline_layer)
    return new_map
Heatmap
from folium.plugins import HeatMap
# Heatmap layer from vehicle locations; folium wants [lat, lon] while the
# parsed points are stored [lon, lat], so swap the pair.
heatmap = init_map(wa_county_outlines)
hm_points = [[lon_lat[1], lon_lat[0]] for lon_lat in vehicle_data['Vehicle Location']]
heatmap.add_child(HeatMap(hm_points))
heatmap
Choropleth
# Continuous color scale for the choropleth, spanning the county EV counts.
linear_color = cmp.LinearColormap(
    ['#F2FFDB', '#8B0000'],
    vmin=min(total_ev_per_county['Count']), vmax=max(total_ev_per_county['Count']),
    caption='Total # of EVs in 2023'
)
# Choropleth of EV ownership per Washington county.
ev_own_choro = init_map(wa_county_outlines,
                        style_function=lambda county: {
                            'fillColor': linear_color(total_ev_per_county.loc[county['properties']['NAME']]['Count']),
                            'fillOpacity': 0.9}
                        )
ev_own_choro.add_child(linear_color)  # attach the colorbar legend
# Label each county with its EV count at its bounding-box center.
for county in wa_county_outlines['features']:
    center_lon, center_lat = compute_county_center(county['geometry']['coordinates'])
    county_ev_count = total_ev_per_county.loc[county['properties']['NAME']]['Count']
    folium.Marker(location=[center_lat, center_lon], icon=folium.DivIcon(
        f"<div style='font-size: 15pt'>{county_ev_count}</div>")
    ).add_to(ev_own_choro)
ev_own_choro
Top Brands (Icons are clickable and provide more information)
num_logos_ = 2  # max brand logos shown per county marker
num_top_ = 3    # max brands listed per county popup
# Split vehicle_data per county so each county's brand counts can be ranked.
# Group by the column name (not a one-element list): pandas >= 2 turns list
# keys into tuples, which would break the county_dfs[county_name] lookup.
county_dfs = {county: county_df for county, county_df in vehicle_data.groupby(by='County')}
# Generate a folium map with each county's top EV brands.
brand_map = init_map(wa_county_outlines)
for county in wa_county_outlines['features']:
    # Marker position and county name.
    center_coord = compute_county_center(county['geometry']['coordinates'])
    county_name = county['properties']['NAME']
    # Top `num_top_` brands (fewer if the county has fewer distinct makes).
    tops = dict(county_dfs[county_name]['Make'].value_counts()[:num_top_])
    top_brands = [list(tops.keys()), list(tops.values())]
    tops_html = f'<h4>{county_name} County\'s Top {len(top_brands[0])} EV Brands</h4>\n<ul>\n'
    logo_html = ''
    for i in range(0, len(top_brands[0])):
        tops_html += f'<li style="list-style-type: none">{top_brands[0][i]}: {top_brands[1][i]}</li>\n'
        if i < num_logos_:
            # Bug fix: alt text previously used index 0, always naming the top brand.
            logo_html += f'<img src="data/Car_Make_Logos/{top_brands[0][i].lower()}.svg" alt="{top_brands[0][i].lower()}">'
    tops_html += '</ul>\n'
    iframe = folium.IFrame(html=tops_html, width=200, height=150)
    popup = folium.Popup(iframe, max_width=2650)
    folium.Marker(location=[center_coord[1], center_coord[0]], popup=popup,
                  icon=folium.DivIcon(logo_html)
                  ).add_to(brand_map)
folium.TileLayer('cartodbpositron').add_to(brand_map)
brand_map